Compile for gfx908 and gfx90a (#130)

* adding compilation for multiple targets * fix build * clean * update Jenkinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean

Compile for gfx908 and gfx90a (#130)
* adding compilation for multiple targets * fix build * clean * update Jenkinsfile * update readme * update Jenkins * use ck::half_t instead of ushort for bf16 * rename enum classes * clean * rename * clean
cd167e49 · Chao Liu · GitHub · ecf337ba · cd167e49 · cd167e49
Unverified Commit cd167e49 authored Mar 31, 2022 by Chao Liu Committed by GitHub Mar 31, 2022
20 changed files
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -182,7 +182,7 @@ pipeline {
                {
                    agent { label rocmnode("nogpu")}
                    environment{
-                        setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
+                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "ckProfiler", no_reboot:true, build_type: 'Release')
@@ -192,7 +192,7 @@ pipeline {
                {
                    agent { label rocmnode("nogpu")}
                    environment{
-                        setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
+                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
                    }
                    steps{
                        // until we stabilize debug build due to compiler crashes
@@ -228,7 +228,7 @@ pipeline {
                {
                    agent{ label rocmnode("gfx908")}
                    environment{
-                        setup_args = """ -D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " -DBUILD_DEV=On """
+                        setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')

--- a/README.md
+++ b/README.md
+## Docker script
+```bash
+docker run                                     \
+-it                                            \
+--privileged                                   \
+--group-add sudo                               \
+-w /root/workspace                             \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace  \
+rocm/tensorflow:rocm4.3.1-tf2.6-dev            \
+/bin/bash
+```
+## Build
+```bash
+mkdir build && cd build
+```
+```bash
+# Need to specify target ID, example below is gfx908 and gfx90a
+cmake                                                                 \
+-D BUILD_DEV=OFF                                                      \
+-D CMAKE_BUILD_TYPE=Release                                           \
+-D CMAKE_CXX_FLAGS=" --offload-arch=gfx908 --offload-arch=gfx90a -O3" \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                        \
+..
+```
+### Build and Run Examples
+```bash
+ make -j examples
+```
+Instructions for running each individual example are under ```example/```
+## Tests
+```bash
+ make -j tests
+ make test
+```
+## Build ckProfiler
+```bash
+ make -j ckProfiler
+```
+Instructions for running ckProfiler are under ```profiler/```
--- a/example/01_gemm/README.md
+++ b/example/01_gemm/README.md
-# Instructions for ```gemm_xdl``` Example
+# Instructions for ```example_gemm_xdl```
-## Docker script
+## Run ```example_gemm_xdl```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```gemm_xdl```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j gemm_xdl
-```
-## Run ```gemm_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
-./example/gemm_xdl 0 1 5
+./bin/example_gemm_xdl 0 1 5
 ```
 Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)

--- a/example/01_gemm/gemm_xdl_fp16.cpp
+++ b/example/01_gemm/gemm_xdl_fp16.cpp
@@ -40,7 +40,7 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
 using BElementOp = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization_t::Default;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
 // clang-format off
 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle

--- a/example/02_gemm_alpha_beta/README.md
+++ b/example/02_gemm_alpha_beta/README.md
-# Instructions for ```gemm_xdl_alpha_beta``` Example
+# Instructions for ```example_gemm_xdl_alpha_beta```
-## Docker script
+## Run ```example_gemm_xdl_alpha_beta```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```gemm_xdl_alpha_beta```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j gemm_xdl_alpha_beta
-```
-## Run ```gemm_xdl_alpha_beta```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
-./example/gemm_xdl_alpha_beta 1 1 1 0.5 0.5
+./bin/example_gemm_xdl_alpha_beta 1 1 1 0.5 0.5
 ```
 Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16)
 ```

--- a/example/03_gemm_bias_relu/README.md
+++ b/example/03_gemm_bias_relu/README.md
-# Instructions for ```gemm_xdl_bias_relu_add``` Example
+# Instructions for ```example_gemm_xdl_bias_relu_add```
-## Docker script
+## Run ```example_gemm_xdl_bias_relu_add```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```gemm_xdl_bias_relu_add```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j gemm_xdl_bias_relu_add
-```
-## Run ```gemm_xdl_bias_relu_add```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
-./example/gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
+./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
 ```
 Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)

--- a/example/04_gemm_bias_relu_add/README.md
+++ b/example/04_gemm_bias_relu_add/README.md
-# Instructions for ```gemm_xdl_bias_relu_add``` Example
+# Instructions for ```example_gemm_xdl_bias_relu_add```
-## Docker script
+## Run ```example_gemm_xdl_bias_relu_add```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```gemm_xdl_bias_relu_add```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j gemm_xdl_bias_relu_add
-```
-## Run ```gemm_xdl_bias_relu_add```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC
-./example/gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
+./bin/example_gemm_xdl_bias_relu_add 0 1 5 3840 4096 4096 4096 4096 4096
 ```
 Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)

--- a/example/05_conv2d_fwd/README.md
+++ b/example/05_conv2d_fwd/README.md
-# Instructions for ```conv2d_fwd_xdl``` Example
+# Instructions for ```example_conv2d_fwd_xdl```
-## Docker script
+## Run ```example_conv2d_fwd_xdl```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```conv2d_fwd_xdl```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j conv2d_fwd_xdl
-```
-## Run ```conv2d_fwd_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./example/conv2d_fwd_xdl 0 1 5
+./bin/example_conv2d_fwd_xdl 0 1 5
 ```
 Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)

--- a/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp
+++ b/example/05_conv2d_fwd/conv2d_fwd_xdl_fp16.cpp
@@ -34,7 +34,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 using DeviceConvFwdInstance = ck::tensor_operation::device::
    DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<

--- a/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp
+++ b/example/05_conv2d_fwd/conv2d_fwd_xdl_int8.cpp
@@ -35,7 +35,7 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 using DeviceConvFwdInstance = ck::tensor_operation::device::
    DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<

--- a/example/06_conv2d_fwd_bias_relu/README.md
+++ b/example/06_conv2d_fwd_bias_relu/README.md
-# Instructions for ```conv_xdl_bias_relu_add``` Example
+# Instructions for ```example_conv_xdl_bias_relu```
-## Docker script
+## Run ```example_conv_xdl_bias_relu```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```conv_xdl_bias_relu_add```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j conv_xdl_bias_relu_add
-```
-## Run ```conv_xdl_bias_relu_add```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./example/conv_xdl_bias_relu_add 0 1 5
+./bin/example_conv_xdl_bias_relu 0 1 5
 ```
 Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
@@ -48,14 +15,8 @@ in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
 wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
 out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
 bias_k: dim 1, lengths {256}, strides {1}
-resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
-arg.b_grid_desc_k0_n_k1_{216, 256, 8}
-arg.c_grid_desc_m_n_{ 165888, 256}
-arg.c0_grid_desc_m_n_{ 165888, 256}
-arg.c1_grid_desc_m_n_{ 165888, 256}
 launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
 Warm up
 Start running 5 times...
-Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
+Perf: 1.39009 ms, 105.581 TFlops, 239.981 GB/s
 ```
--- a/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
+++ b/example/06_conv2d_fwd_bias_relu/conv2d_fwd_xdl_bias_relu.cpp
@@ -32,10 +32,10 @@ using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
-static constexpr auto MemorySet = ck::InMemoryDataOperationEnum_t::Set;
+static constexpr auto MemorySet = ck::InMemoryDataOperationEnum::Set;
 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::

--- a/example/07_conv2d_fwd_bias_relu_add/README.md
+++ b/example/07_conv2d_fwd_bias_relu_add/README.md
-# Instructions for ```conv_xdl_bias_relu_add``` Example
+# Instructions for ```example_conv_xdl_bias_relu_add```
-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```conv_xdl_bias_relu_add```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j conv_xdl_bias_relu_add
-```
-## Run ```conv_xdl_bias_relu_add```
+## Run ```example_conv_xdl_bias_relu_add```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./example/conv_xdl_bias_relu_add 0 1 5
+./bin/example_conv_xdl_bias_relu_add 0 1 5
 ```
 Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
@@ -49,13 +17,8 @@ wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
 out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
 bias_k: dim 1, lengths {256}, strides {1}
 resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
-arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
-arg.b_grid_desc_k0_n_k1_{216, 256, 8}
-arg.c_grid_desc_m_n_{ 165888, 256}
-arg.c0_grid_desc_m_n_{ 165888, 256}
-arg.c1_grid_desc_m_n_{ 165888, 256}
 launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
 Warm up
 Start running 5 times...
-Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
+Perf: 1.44711 ms, 101.421 TFlops, 289.218 GB/s
 ```
--- a/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
+++ b/example/07_conv2d_fwd_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp
@@ -33,7 +33,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd;
 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::

--- a/example/08_conv3d_fwd/README.md
+++ b/example/08_conv3d_fwd/README.md
-# Instructions for ```conv3d_fwd_xdl``` Example
+# Instructions for ```example_conv3d_fwd_xdl```
-## Docker script
+## Run ```example_conv3d_fwd_xdl```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```conv3d_fwd_xdl```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j conv3d_fwd_xdl
-```
-## Run ```conv3d_fwd_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 24: N, K, C, Z, Y, X, Di, Hi, Wi, Sz, Sy, Sx, Dz, Dy, Dx, leftPz, LeftPy, LeftPx, RightPz, RightPy, RightPx
-./example/conv3d_fwd_xdl 0 1 5
+./bin/example_conv3d_fwd_xdl 0 1 5
 ```
-Result (MI100 dynamic frequency)
+Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16)
 ```
-in: dim 5, lengths {4, 71, 71, 71, 192}, strides {68718912, 967872, 13632, 192, 1}
 wei: dim 5, lengths {256, 3, 3, 3, 192}, strides {5184, 1728, 576, 192, 1}
 out: dim 5, lengths {4, 36, 36, 36, 256}, strides {11943936, 331776, 9216, 256, 1}
-a_grid_desc_b_k0_m_k1{1, 648, 186624, 8}
+num_batches_of_GEMM = 1
-b_grid_desc_b_k0_n_k1{1, 648, 256, 8}
+a_grid_desc_k0_m_k1{648, 186624, 8}
+b_grid_desc_k0_n_k1{648, 256, 8}
+c_grid_desc_m_n{ 186624, 256}
 launch_and_time_kernel: grid_dim {1458, 1, 1}, block_dim {256, 1, 1}
 Warm up
 Start running 5 times...
-Perf: 4.49466 ms, 110.206 TFlops, 144.161 GB/s
+Perf: 4.58795 ms, 107.965 TFlops, 141.23 GB/s
 ```
--- a/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp
+++ b/example/08_conv3d_fwd/conv3d_fwd_xdl.cpp
@@ -37,7 +37,7 @@ using WeiLayout = ck::tensor_layout::convolution::KZYXC;
 using OutLayout = ck::tensor_layout::convolution::NDHWK;
 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 using DeviceConv3dFwdInstance = ck::tensor_operation::device::
    DeviceConv3dFwdXdl_Input_N_Di_Hi_Wi_C_Weight_K_Z_Y_X_C_Output_N_Do_Ho_Wo_K<

--- a/example/09_convnd_fwd/README.md
+++ b/example/09_convnd_fwd/README.md
-# Instructions for ```convnd_fwd_xdl``` Example
+# Instructions for ```example_convnd_fwd_xdl```
-## Docker script
+## Run ```example_convnd_fwd_xdl```
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```convnd_fwd_xdl```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j convnd_fwd_xdl
-```
-## Run ```convnd_fwd_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
@@ -47,7 +14,7 @@ cmake                                                                  \
 # <dilations>, (ie Dy, Dx for 2D)
 # <left padding>, (ie LeftPy, LeftPx for 2D)
 # <right padding>, (ie RightPy, RightPx for 2D)
-./example/convnd_fwd_xdl 0 1 100
+./bin/example_convnd_fwd_xdl 0 1 100
 ```
 Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32)

--- a/example/09_convnd_fwd/convnd_fwd_xdl.cpp
+++ b/example/09_convnd_fwd/convnd_fwd_xdl.cpp
@@ -26,7 +26,7 @@ using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvFwdDefault =
-    ck::tensor_operation::device::ConvolutionForwardSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
 using DeviceConvFwdBasePtr =
    ck::tensor_operation::device::DeviceConvFwdPtr<InElementOp, WeiElementOp, OutElementOp>;

--- a/example/10_conv2d_bwd_data/README.md
+++ b/example/10_conv2d_bwd_data/README.md
-# Instructions for ```conv2d_bwd_data_xdl``` Example
+# Instructions for ```example_conv2d_bwd_data_xdl``` Example
-## Docker script
-```bash
-docker run                                                                   \
-it                                                                          \
--rm                                                                         \
--privileged                                                                 \
--group-add sudo                                                             \
-w /root/workspace                                                           \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
-rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
-/bin/bash
-```
-## Build ```conv2d_bwd_data_xdl```
-```bash
-mkdir build && cd build
-```
-```bash
-# Need to specify target ID, example below is gfx908
-cmake                                                                  \
-D BUILD_DEV=OFF                                                       \
-D CMAKE_BUILD_TYPE=Release                                            \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
-..
-```
-```bash
- make -j conv2d_bwd_data_xdl
-```
-## Run ```conv2d_bwd_data_xdl```
+## Run ```example_conv2d_bwd_data_xdl```
 ```bash
 #arg1: verification (0=no, 1=yes)
 #arg2: initialization (0=no init, 1=integer value, 2=decimal value)
 #arg3: run kernel # of times (>1)
 #arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
-./bin/conv2d_bwd_data_xdl 0 1 5
+./bin/example_conv2d_bwd_data_xdl 0 1 5
 ```
 Result

--- a/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
+++ b/example/10_conv2d_bwd_data/conv2d_bwd_data_xdl.cpp
@@ -27,7 +27,7 @@ using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto ConvBwdDefault =
-    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;
 using DeviceConvBwdDataInstance = ck::tensor_operation::device::
    DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<
@@ -38,7 +38,7 @@ using DeviceConvBwdDataInstance = ck::tensor_operation::device::
        InElementOp,    // InElementwiseOperation
        WeiElementOp,   // WeiElementwiseOperation
        OutElementOp,   // OutElementwiseOperation
-        ConvBwdDefault, // ConvolutionBackwardDataSpecialization_t
+        ConvBwdDefault, // ConvolutionBackwardDataSpecialization
        256,            // BlockSize
        128,            // MPerBlock
        128,            // NPerBlock