Unverified commit cd167e49, authored by Chao Liu, committed by GitHub

Compile for gfx908 and gfx90a (#130)

* adding compilation for multiple targets

* fix build

* clean

* update Jenkinsfile

* update readme

* update Jenkins

* use ck::half_t instead of ushort for bf16

* rename enum classes

* clean

* rename

* clean
parent ecf337ba
# Instructions for ```example_conv2d_wrw_xdl``` Example

## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```example_conv2d_wrw_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j example_conv2d_wrw_xdl
```
## Run ```example_conv2d_wrw_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# ...
```
# Instructions for ```example_reduce_blockwise```

## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```example_reduce_blockwise```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j example_reduce_blockwise
```
## Run ```example_reduce_blockwise```
```bash
# -D <xxx> : input 4-d tensor lengths
# -v <x> : verification (0=no, 1=yes)
#arg1: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg2: run kernel # of times (>1)
./bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
```
Result
...
Start running 3 times...
Perf: 0.23536 ms, 267.32 GB/s, DeviceReduceBlockWise<256,M_C4_S1,K_C64_S1,InSrcVectorDim_0_InSrcVectorSize_1_OutDstVectorSize_1>
error: 0
max_diff: 0, 529, 529
root@dc-smc-18:/data/composable_kernel/Build3# bin/example_reduce_blockwise -D 16,64,32,960 -v 1 1 10
launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1}
Warm up
Start running 10 times...
...
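As a quick sanity check on the log above, the reported bandwidth matches streaming the 16x64x32x960 input exactly once; the arithmetic below assumes an fp16 input tensor (2 bytes per element) and neglects output traffic.

```cpp
#include <cstdio>

int main()
{
    const double elements = 16.0 * 64 * 32 * 960; // input tensor lengths from the run
    const double bytes    = elements * 2;         // assumed fp16 input: 2 bytes/element
    const double seconds  = 0.23536e-3;           // kernel time reported in the log
    std::printf("%.2f GB/s\n", bytes / seconds / 1e9); // ~267.3 GB/s, matching the log
    return 0;
}
```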
@@ -32,10 +32,10 @@ using HostAccDataType = float;
constexpr int Rank = 4;
constexpr int NumReduceDim = 3;

constexpr ReduceTensorOp ReduceOpId = ReduceTensorOp::NORM2;
constexpr NanPropagation NanOpt = NanPropagation::PROPAGATE_NAN;
constexpr bool PropagateNan = (NanOpt == NanPropagation::NOT_PROPAGATE_NAN) ? false : true;
constexpr ReduceTensorIndices IndicesOpt = ReduceTensorIndices::NO_INDICES;

using ReduceOperation = typename reduce_binary_operator<AccDataType, ReduceOpId>::opType;
using InElementwiseOperation =
@@ -210,11 +210,11 @@ int main(int argc, char* argv[])
        return (-1);

    constexpr bool op_support_indices =
        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
         ReduceOpId == ReduceTensorOp::AMAX);

    constexpr bool NeedIndices =
        (op_support_indices && (IndicesOpt != ReduceTensorIndices::NO_INDICES));

    // if the input is half type, there is no reason to use float for indexed reduction
    // operations, but float must be used for non-indexed reduction operations for accuracy
@@ -230,7 +230,7 @@ int main(int argc, char* argv[])
    // the indices option can only be used when it is really needed
    constexpr bool invalid_reduce_3 =
        (!op_support_indices && IndicesOpt != ReduceTensorIndices::NO_INDICES);

    constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3);
...
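NORM2 here means the reduction computes the square root of the sum of squares of the reduced elements. A minimal host-side sketch of that math (illustrative only; the example's actual verifier is templated over the types above and reduces 3 of the 4 dimensions):

```cpp
#include <cmath>
#include <vector>

// reference NORM2 over one flattened reduce dimension, accumulating in
// float (HostAccDataType above)
float norm2_reference(const std::vector<float>& x)
{
    float acc = 0.f;
    for(float v : x)
        acc += v * v;
    return std::sqrt(acc);
}
```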
# Instructions for ```example_pool2d_fwd``` Example

## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```example_pool2d_fwd```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j example_pool2d_fwd
```
## Run ```example_pool2d_fwd```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=single integer value, 2=scope integer value, 3=decimal value)
#arg3: run kernel # of times (>1)
#arg4 to 15: N, C, Y, X, Hi, Wi, Sy, Sx, LeftPy, LeftPx, RightPy, RightPx
./bin/example_pool2d_fwd 1 1 10
```
Result
...
@@ -22,9 +22,9 @@ using InLayout = ck::tensor_layout::convolution::NHWC;
using OutLayout = ck::tensor_layout::convolution::NHWC;

#if 1
static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX;
#else
static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG;
#endif

static constexpr bool NeedIndices = false;
@@ -47,7 +47,7 @@ using DevicePoolFwdInstance =
template <typename InDataType,
          typename OutDataType,
          typename AccDataType,
          ck::ReduceTensorOp ReduceOpId,
          bool PropagateNan,
          bool NeedIndices>
static void pool_host_verify(const Tensor<InDataType>& in,
...
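The host verifier walks every output pixel and applies the reduction over the pooling window. A minimal sketch for the MAX case over one (n, c) plane (illustrative; the flat indexing, fixed float type, and simplified padding handling are assumptions, the real routine is templated as above):

```cpp
#include <algorithm>
#include <limits>
#include <vector>

// naive 2-D max-pool reference; no dilation, padded taps are skipped
void max_pool2d_reference(const std::vector<float>& in, std::vector<float>& out,
                          int Hi, int Wi, int Y, int X, int Sy, int Sx,
                          int LeftPy, int LeftPx, int Ho, int Wo)
{
    for(int ho = 0; ho < Ho; ++ho)
        for(int wo = 0; wo < Wo; ++wo)
        {
            float v = -std::numeric_limits<float>::infinity();
            for(int y = 0; y < Y; ++y)
                for(int x = 0; x < X; ++x)
                {
                    const int hi = ho * Sy + y - LeftPy;
                    const int wi = wo * Sx + x - LeftPx;
                    if(hi >= 0 && hi < Hi && wi >= 0 && wi < Wi)
                        v = std::max(v, in[hi * Wi + wi]);
                }
            out[ho * Wo + wo] = v;
        }
}
```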
# Instructions for ```example_grouped_gemm_xdl```

## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```example_grouped_gemm_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j example_grouped_gemm_xdl_fp16
```
## Run ```example_grouped_gemm_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
# ...
```
@@ -40,9 +40,9 @@ using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// static constexpr auto GemmMNPadding =
//     ck::tensor_operation::device::GemmSpecialization::MNPadding;

// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGroupedGemmXdl
...
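All three elementwise operators here are PassThrough, i.e. the identity. A minimal sketch of what such a functor looks like (illustrative; the library's actual definition may differ in signature):

```cpp
#include <hip/hip_runtime.h>

// identity elementwise operator: y = x
struct PassThrough
{
    template <typename T>
    __host__ __device__ void operator()(T& y, const T& x) const
    {
        y = x;
    }
};
```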
@@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;

static constexpr auto GemmSpecialization =
    ck::tensor_operation::device::GemmSpecialization::Default;

// clang-format off
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
...
# Instructions for ```example_convnd_bwd_data_xdl```

## Docker script
```bash
docker run \
-it \
--rm \
--privileged \
--group-add sudo \
-w /root/workspace \
-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace \
rocm/tensorflow:rocm4.3.1-tf2.6-dev \
/bin/bash
```
## Build ```example_convnd_bwd_data_xdl```
```bash
mkdir build && cd build
```
```bash
# Need to specify target ID, example below is gfx908
cmake \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 " \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH=/opt/rocm \
..
```
```bash
make -j example_convnd_bwd_data_xdl
```
## Run ```example_convnd_bwd_data_xdl```
```bash
#arg1: verification (0=no, 1=yes)
#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
#arg3: run kernel # of times (>1)
#arg4: num_dim_spatial(1|2|3)
#arg5 to ...: N, K, C, [Z,] [Y,] X, [Di,] [Hi,] Wi, [Sz,] [Sy,] Sx, [Dz,] [Dy,] Dx, [LeftPz,] [LeftPy,] LeftPx, [RightPz,] [RightPy,] RightPx
./bin/example_convnd_bwd_data_xdl 0 1 5
```
Result
...
@@ -29,7 +29,7 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;

static constexpr auto ConvBwdDefault =
    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization::Default;

using DeviceConvBwdDataBasePtr =
    ck::tensor_operation::device::DeviceConvBwdDataPtr<InElementOp, WeiElementOp, OutElementOp>;
@@ -44,7 +44,7 @@ using DeviceConvNDBwdDataInstance = ck::tensor_operation::device::
        InElementOp,    // InElementwiseOperation
        WeiElementOp,   // WeiElementwiseOperation
        OutElementOp,   // OutElementwiseOperation
        ConvBwdDefault, // ConvolutionBackwardDataSpecialization
        NumDimSpatial,  // NumDimSpatial
        256,            // BlockSize
        128,            // MPerBlock
...
@@ -40,7 +40,7 @@ using D0ReduceOp = ck::tensor_operation::element_wise::ReduceSum;
using D1ReduceOp = ck::tensor_operation::element_wise::ReduceSquareSum;

static constexpr auto GemmSpecialization =
    ck::tensor_operation::device::GemmSpecialization::Default;

// clang-format off
using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle
...
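GemmSpecialization::Default assumes the problem sizes already fit the kernel's block tiles; the MNPadding variant (commented out in the grouped-GEMM example above) instead pads M and N up to tile multiples inside the kernel. A sketch of the padding arithmetic the specialization implies (illustrative only, not CK's code):

```cpp
// round a dimension up to the next multiple of the block tile, as an
// MNPadding-style specialization would do internally
constexpr int pad_to_tile(int dim, int tile) { return ((dim + tile - 1) / tile) * tile; }

static_assert(pad_to_tile(1000, 128) == 1024, "M=1000 pads up to 8 tiles of 128");
```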
@@ -6,15 +6,9 @@
#include "hip/hip_fp16.h"
#endif

// constant address space for kernel parameter
// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
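CK_CONSTANT_ADDRESS_SPACE tags kernel-argument pointers with AMDGPU address space 4; device code then casts back to the generic address space before dereferencing. A minimal sketch of such a cast helper (assumes hipcc/clang; the library's actual helper may differ):

```cpp
#include <hip/hip_runtime.h>

#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))

// cast a pointer in the "constant" address space (4) back to the generic
// address space (0); a C-style cast is used because static_cast cannot
// change address spaces
template <typename T>
__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
    return (T*)p;
}
```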
// launch bounds
#define CK_USE_LAUNCH_BOUNDS 1
@@ -24,155 +18,134 @@
#define CK_MIN_BLOCK_PER_CU 2
#endif
// check GPU target
#ifdef __HIP_DEVICE_COMPILE__
#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
      defined(__gfx90a__) || defined(__gfx1030__))
#error Not supported target
#endif
#endif

// buffer resource, wave size
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
#define CK_GPU_WAVE_SIZE -1
#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
    defined(__gfx90a__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
#define CK_GPU_WAVE_SIZE 64
#elif defined(__gfx1030__) // for GPU code
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
#define CK_GPU_WAVE_SIZE 32
#endif
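The 3rd dword configures the 128-bit buffer resource descriptor used by buffer_load/buffer_store and differs between gfx9 and gfx103x; the -1 host-side sentinels just keep the header compilable when no device target is active. A rough sketch of where the constant fits (illustrative only, not the library's actual packing code; assumes this header's macro is in scope):

```cpp
#include <cstdint>

// conceptual layout of a 128-bit AMD buffer resource descriptor; device
// code packs these four dwords into an SGPR quad
struct BufferResource
{
    uint64_t base_address;                          // dwords 0-1
    uint32_t range_bytes;                           // dword 2: buffer size
    uint32_t config = CK_BUFFER_RESOURCE_3RD_DWORD; // dword 3: target-specific
};
```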
// FMA instruction
#ifndef __HIP_DEVICE_COMPILE__ // for host code, define nothing
#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
#define CK_USE_AMD_V_MAC_F32
#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || \
    defined(__gfx1030__) // for GPU code
#define CK_USE_AMD_V_FMAC_F32
#define CK_USE_AMD_V_DOT2_F32_F16
#define CK_USE_AMD_V_DOT4_I32_I8
#endif

// MFMA instruction
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_MFMA
#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
#define CK_USE_AMD_MFMA
#endif

#if defined(__gfx90a__)
#define CK_USE_AMD_MFMA_BF16_1K_OP
#endif
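Kernels can branch on these per-target feature macros to pick an instruction path. A minimal sketch (illustrative; the builtin is expected, not guaranteed, to lower to v_fmac_f32/v_mac_f32 on the respective targets):

```cpp
#include <hip/hip_runtime.h>

// select the fused-multiply-add path from the per-target feature macros
__device__ float fma_f32(float a, float b, float c)
{
#if defined(CK_USE_AMD_V_FMAC_F32) || defined(CK_USE_AMD_V_MAC_F32)
    return __builtin_fmaf(a, b, c); // single-instruction FMA/MAC path
#else
    return a * b + c;               // generic fallback
#endif
}
```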
// buffer load
#define CK_USE_AMD_BUFFER_LOAD 1

// buffer store
#define CK_USE_AMD_BUFFER_STORE 1

// buffer atomic add: integer
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1

// buffer atomic add: floating point
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
#else // for GPU code
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
#endif

// inline asm
#define CK_USE_AMD_INLINE_ASM 1

// inner product (DLOP)
#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1

// experimental feature: multi index implemented as array
#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0

// experimental feature: static tensor descriptor
#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0

// experimental feature: buffer load/store/atomic-add OOB trick
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1

// experimental feature: in-register sub-dword transpose
#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1

// experimental feature: merge transformation use magic number division
#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1

// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
// pointer of scalar
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0

// experimental feature: use __builtin_memcpy instead of union to do bit_cast
#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
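With CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST enabled, type punning goes through __builtin_memcpy, which the compiler elides, instead of a union. A minimal sketch of such a helper (illustrative; CK's actual helper may differ):

```cpp
#include <hip/hip_runtime.h>

// reinterpret the bits of x as a value of type Y without the UB of
// pointer-cast type punning; the memcpy is optimized away
template <typename Y, typename X>
__host__ __device__ Y bit_cast(const X& x)
{
    static_assert(sizeof(X) == sizeof(Y), "bit_cast requires equal sizes");
    Y y;
    __builtin_memcpy(&y, &x, sizeof(X));
    return y;
}
```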
// hack: has an underlying assumption that needs to be satisfied, otherwise it's a bug
// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
// thread-invariant, otherwise it's a bug
// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0

// workaround: compiler crash when compiling recursive lambda
#define CK_WORKAROUND_SWDEV_275126 1

// workaround: compiler crash when using buffer load/store for i8
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1

// workaround: compiler generating inefficient ds_write instructions
#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1

// workaround: verification failure, due to compiler regression, for conv bwd-data fp16 using some
// tuning parameters
#define CK_WORKAROUND_SWDEV_325164 1

// workaround for verification failure in ConvNd forward
// https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/135
#define CK_WORKAROUND_GITHUB_135 1
namespace ck {

enum struct InMemoryDataOperationEnum
{
    Set,
    AtomicAdd,
    Add
};
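This enum is what the blockwise/threadwise transfer classes later in this commit take as DstInMemOp. A minimal sketch of how a final store might dispatch on it (illustrative names; assumes this enum and HIP's atomicAdd are in scope):

```cpp
#include <hip/hip_runtime.h>

// apply the destination memory operation at the final store
template <InMemoryDataOperationEnum DstInMemOp>
__device__ void store_value(float* p_dst, float v)
{
    if constexpr(DstInMemOp == InMemoryDataOperationEnum::Set)
        *p_dst = v;          // plain store
    else if constexpr(DstInMemOp == InMemoryDataOperationEnum::AtomicAdd)
        atomicAdd(p_dst, v); // atomic read-modify-write
    else
        *p_dst += v;         // non-atomic accumulate (Add)
}
```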
// TODO: no longer needed, remove this
enum struct ActivTypeEnum
{
    None,
    LeakyRelu,
...
@@ -4,7 +4,7 @@
namespace ck {

// StaticTensor for Scalar
template <AddressSpaceEnum AddressSpace,
          typename T,
          typename TensorDesc,
          bool InvalidElementUseNumericalZeroValue,
@@ -80,7 +80,7 @@ struct StaticTensor
};

// StaticTensor for vector
template <AddressSpaceEnum AddressSpace,
          typename S,
          index_t ScalarPerVector,
          typename TensorDesc,
@@ -245,7 +245,7 @@ struct StaticTensorTupleOfVectorBuffer
    S ignored_element_scalar_;
};

template <AddressSpaceEnum AddressSpace,
          typename T,
          typename TensorDesc,
          typename enable_if<TensorDesc::IsKnownAtCompileTime(), bool>::type = false>
@@ -255,7 +255,7 @@ __host__ __device__ constexpr auto make_static_tensor(TensorDesc)
}

template <
    AddressSpaceEnum AddressSpace,
    typename T,
    typename TensorDesc,
    typename X,
...
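A sketch of how these overloads are meant to be used: build a compile-time descriptor, then materialize a register-resident tensor over it. This is a device-code fragment and an assumption on my part; the descriptor helper names follow CK conventions but are not confirmed by the excerpt above.

```cpp
// declare a 2x2 fp32 tensor held in VGPRs, fully sized at compile time
// (illustrative fragment; requires the CK headers this file belongs to)
constexpr auto desc =
    make_naive_tensor_descriptor_packed(make_tuple(Number<2>{}, Number<2>{}));

auto t = make_static_tensor<AddressSpaceEnum::Vgpr, float>(desc);
```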
@@ -207,9 +207,9 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v2r2_pipeline_2x2
                          CM0M1N0N1ThreadDesc{}.GetLength(I2) == N0,
                      "wrong");

        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_k_m0_m1_thread_desc_.GetElementSpaceSize());
        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_k_n0_n1_thread_desc_.GetElementSpaceSize());

        constexpr auto threadwise_gemm =
...
@@ -220,9 +220,9 @@ struct BlockwiseGemmDlops_A_BK0_BM_BK1_B_BK0_BN_BK1_C_BM0_BM1_BN0_BN1_pipeline_B
                          CThreadDesc_BM0_BM11_BN0_BN11{}.GetLength(I2) == BN0,
                      "wrong");

        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatA>(
            a_thread_desc_bk0_bm0_bm1_bk1_.GetElementSpaceSize());
        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatB>(
            b_thread_desc_bk0_bn0_bn1_bk1_.GetElementSpaceSize());

        constexpr auto threadwise_contraction =
...
@@ -119,7 +119,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3
        constexpr auto a_block_mtx = ABlockDesc_E1_K1_E2{};

        // thread A buffer for GEMM
        StaticBuffer<AddressSpaceEnum::Vgpr, FloatA, a_thread_mtx_.GetElementSpaceSize(), true>
            a_thread_buf;

        constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3<FloatA,
...
@@ -42,7 +42,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);

    StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
                              FloatAcc,
                              MRepeat * NRepeat,
                              xdlops_gemm.GetRegSizePerXdlops(),
@@ -250,9 +250,9 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                       const BBlockBuffer& b_block_buf,
                       CThreadBuffer& c_thread_buf) const
    {
        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
            a_thread_desc_.GetElementSpaceSize());
        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, FloatAB>(
            b_thread_desc_.GetElementSpaceSize());

        static_for<0, MRepeat, 1>{}([&](auto m0) {
...
@@ -16,7 +16,7 @@ namespace ck {
template <index_t BlockSize,
          typename SrcElementwiseOperation,
          typename DstElementwiseOperation,
          InMemoryDataOperationEnum DstInMemOp,
          typename BlockSliceLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
...
@@ -14,7 +14,7 @@ namespace ck {
// 2. ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template <index_t BlockSize,
          InMemoryDataOperationEnum DstInMemOp,
          typename BlockSliceLengths,
          typename ThreadSliceLengths,
          typename ThreadClusterLengths,
...
@@ -15,7 +15,7 @@ namespace ck {
// 3. ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
template <index_t BlockSize,
          typename ElementwiseOperation,
          InMemoryDataOperationEnum DstInMemOp,
          typename BlockSliceLengths,
          typename ThreadClusterLengths,
          typename ThreadClusterArrangeOrder,
...