Merge branch 'develop' into amd-develop

dddc2115 · Jun Liu · 6e01019b · 08eb1769 · dddc2115 · dddc2115
Commit dddc2115 authored Aug 08, 2023 by Jun Liu
20 changed files
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
@@ -8,7 +8,7 @@
 #include "ck/tensor_operation/gpu/device/impl/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
+#ifdef __int8__
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -123,3 +123,4 @@ void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
+#endif
--- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_elementwise_normalization_instance
    device_elementwise_normalization_f16_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt
@@ -95,36 +95,39 @@ endif()

 add_instance_library(device_gemm_instance ${GEMM_INSTANCES})

-set(ENABLE_PIPELINE_V2_OPT OFF)

-if (ENABLE_PIPELINE_V2_OPT)
-set(MAX_ILP_OPTS
-   -mllvm
-   -amdgpu-enable-max-ilp-scheduling-strategy
-)
-set(WAVES_PER_EU_DEFS
-   CK_USE_WAVES_PER_EU=1
-   CK_MIN_WAVES_PER_EU=1
-   CK_MAX_WAVES_PER_EU=1
-)
-set(IGLP_OPT_DEFS
-   CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
-)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  set(ENABLE_PIPELINE_V2_OPT OFF)
+
+  if (ENABLE_PIPELINE_V2_OPT)
+    set(MAX_ILP_OPTS
+       -mllvm
+       -amdgpu-enable-max-ilp-scheduling-strategy
+    )
+    set(WAVES_PER_EU_DEFS
+       CK_USE_WAVES_PER_EU=1
+       CK_MIN_WAVES_PER_EU=1
+       CK_MAX_WAVES_PER_EU=1
+    )
+    set(IGLP_OPT_DEFS
+       CK_EXPERIMENTAL_PIPELINE_V2_IGLP_OPT=1
+    )

-# layout=NT
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS ";;"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-# layout=NN
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-# layout=TT
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS}")
-# layout=TN
-set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
-   COMPILE_OPTIONS "${MAX_ILP_OPTS}"
-   COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
-endif(ENABLE_PIPELINE_V2_OPT)
+    # layout=NT
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS ";;"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=NN
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=TT
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS ";;"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+    # layout=TN
+    set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES
+       COMPILE_OPTIONS "${MAX_ILP_OPTS}"
+       COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}")
+  endif(ENABLE_PIPELINE_V2_OPT)
+endif(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp
@@ -18,7 +18,7 @@ using Instances =
        //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                      |
        //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                      |
        //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                      |
-        DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     8,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Default,  PipelineVersion::v2>
+        DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     8,  8,   32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Default,  PipelineVersion::v2>
 #endif
        // clang-format on
        >;

--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_gemm_add_add_fastgelu_instance
   device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
   device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
   device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
   device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_gemm_add_fastgelu_instance
   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
   device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_gemm_add_relu_add_layernorm_instance
   device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
   device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
   device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
   device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_gemm_bilinear_instance
   device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
   device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
   device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
   device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_gemm_fastgelu_instance
   device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
   device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
   device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
   device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_gemm_streamk_instance
   # device_gemm_xdl_streamk_f32_f32_f32_mk_kn_mn_instance.cpp
   # device_gemm_xdl_streamk_f32_f32_f32_mk_nk_mn_instance.cpp
@@ -8,3 +9,4 @@ add_instance_library(device_gemm_streamk_instance
   # device_gemm_xdl_streamk_f16_f16_f16_km_kn_mn_instance.cpp
   # device_gemm_xdl_streamk_f16_f16_f16_km_nk_mn_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_grouped_gemm_instance
   device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
   device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -8,3 +9,4 @@ add_instance_library(device_grouped_gemm_instance
   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp
   device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
 add_instance_library(device_grouped_gemm_fastgelu_instance
   device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp
   device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp
   device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp
   device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp
 )
+endif()
--- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt
-add_instance_library(device_normalization_instance
-    device_layernorm2d_f16_instance.cpp
-    device_layernorm2d_f32_instance.cpp
+set(DEVICE_NORMALIZATION_INSTANCES)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_NORMALIZATION_INSTANCES device_layernorm2d_f16_instance.cpp
    device_layernorm4d_f16_instance.cpp
-    device_layernorm4d_f32_instance.cpp
    device_groupnorm_f16_instance.cpp
-    device_groupnorm_f32_instance.cpp
    device_groupnorm_swish_f16_instance.cpp
-    device_groupnorm_swish_f32_instance.cpp
-    device_groupnorm_swish_f16_f32_f32_f16_instance.cpp
-)
+    device_groupnorm_swish_f16_f32_f32_f16_instance.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_NORMALIZATION_INSTANCES device_layernorm2d_f32_instance.cpp
+    device_layernorm4d_f32_instance.cpp
+    device_groupnorm_f32_instance.cpp
+    device_groupnorm_swish_f32_instance.cpp)
+endif()
+add_instance_library(device_normalization_instance ${DEVICE_NORMALIZATION_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/pool_fwd/CMakeLists.txt
-add_instance_library(device_pool_fwd_instance
-    device_avg_pool2d_fwd_nhwc_f16_instance.cpp
-    device_avg_pool2d_fwd_nhwc_f32_instance.cpp
-    device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
-    device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
-    device_max_pool2d_fwd_nhwc_f16_instance.cpp
-    device_max_pool2d_fwd_nhwc_f32_instance.cpp
-    device_max_pool3d_fwd_ndhwc_f16_instance.cpp
-    device_max_pool3d_fwd_ndhwc_f32_instance.cpp
-)
+set(DEVICE_POOL_FWD_INSTANCES)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_POOL_FWD_INSTANCES device_avg_pool2d_fwd_nhwc_f16_instance.cpp
+                                          device_avg_pool3d_fwd_ndhwc_f16_instance.cpp
+                                          device_max_pool2d_fwd_nhwc_f16_instance.cpp
+                                          device_max_pool3d_fwd_ndhwc_f16_instance.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_POOL_FWD_INSTANCES device_avg_pool2d_fwd_nhwc_f32_instance.cpp
+                                          device_avg_pool3d_fwd_ndhwc_f32_instance.cpp
+                                          device_max_pool2d_fwd_nhwc_f32_instance.cpp
+                                          device_max_pool3d_fwd_ndhwc_f32_instance.cpp)
+endif()
+add_instance_library(device_pool_fwd_instance ${DEVICE_POOL_FWD_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt
 if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
-set(CONV2D_PERLAYER_QUANT_SRC
-    conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp
-    conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp
-)
-
-set(CONV2D_PERCHANNEL_QUANT_SRC
-    conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp
-    conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp
-)
-
-set(CONV2D_BIAS_PERLAYER_QUANT_SRC
-    conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp
-    conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp
-)
-
-set(CONV2D_BIAS_PERCHANNEL_QUANT_SRC
-    conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp
-    conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp
-)

+set(CONV2D_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp)
+set(CONV2D_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp)
+set(CONV2D_BIAS_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp)
+set(CONV2D_BIAS_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_xdl_bias_perchannel_quantization_int8_instance.cpp)
 set(GEMM_QUANT_SRC
-    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
-    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
-    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
-    gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
    gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
 )
+if(DL_KERNELS)
+    list(APPEND CONV2D_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp)
+    list(APPEND CONV2D_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp)
+    list(APPEND CONV2D_BIAS_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp)
+    list(APPEND CONV2D_BIAS_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp)
+    list(APPEND GEMM_QUANT_SRC
+        gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp
+        gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp
+        gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp
+        gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp)
+endif()

 add_instance_library(device_quantization_instance
    ${CONV2D_PERLAYER_QUANT_SRC}

--- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt
-add_instance_library(device_softmax_instance
-    device_softmax_f16_f16_instance_rank3_reduce1.cpp
+set(DEVICE_SOFTMAX_INSTANCES)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_SOFTMAX_INSTANCES device_softmax_f16_f16_instance_rank3_reduce1.cpp
    device_softmax_f16_f16_instance_rank3_reduce2.cpp
    device_softmax_f16_f16_instance_rank3_reduce3.cpp
    device_softmax_f16_f16_instance_rank4_reduce1.cpp
    device_softmax_f16_f16_instance_rank4_reduce2.cpp
    device_softmax_f16_f16_instance_rank4_reduce3.cpp
-    device_softmax_f16_f16_instance_rank4_reduce4.cpp
-    device_softmax_f32_f32_instance_rank3_reduce1.cpp
+    device_softmax_f16_f16_instance_rank4_reduce4.cpp)
+endif()
+if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
+    list(APPEND DEVICE_SOFTMAX_INSTANCES device_softmax_f32_f32_instance_rank3_reduce1.cpp
    device_softmax_f32_f32_instance_rank3_reduce2.cpp
    device_softmax_f32_f32_instance_rank3_reduce3.cpp
    device_softmax_f32_f32_instance_rank4_reduce1.cpp
    device_softmax_f32_f32_instance_rank4_reduce2.cpp
    device_softmax_f32_f32_instance_rank4_reduce3.cpp
-    device_softmax_f32_f32_instance_rank4_reduce4.cpp
-)
+    device_softmax_f32_f32_instance_rank4_reduce4.cpp)
+endif()
+add_instance_library(device_softmax_instance ${DEVICE_SOFTMAX_INSTANCES})
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -136,10 +136,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
    // profile device Conv instances
    bool all_pass = true;

-    std::array<ck::index_t, NDimSpatial> input_spatial_lengths{};
-    std::array<ck::index_t, NDimSpatial> filter_spatial_lengths{};
-    std::array<ck::index_t, NDimSpatial> output_spatial_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
+    std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
    std::array<ck::index_t, NDimSpatial + 3> input_strides{};
+    std::array<ck::index_t, NDimSpatial + 3> weights_strides{};
    std::array<ck::index_t, NDimSpatial + 3> output_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
@@ -148,10 +149,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,

    auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };

-    range_copy(conv_param.input_spatial_lengths_, begin(input_spatial_lengths));
-    range_copy(conv_param.filter_spatial_lengths_, begin(filter_spatial_lengths));
-    range_copy(conv_param.output_spatial_lengths_, begin(output_spatial_lengths));
+    range_copy(in_g_n_c_wis_desc.GetLengths(), begin(input_lengths));
    range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
+    range_copy(wei_g_k_c_xs_desc.GetLengths(), begin(filter_lengths));
+    range_copy(wei_g_k_c_xs_desc.GetStrides(), begin(weights_strides));
+    range_copy(out_g_n_k_wos_desc.GetLengths(), begin(output_lengths));
    range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
    range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
    range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
@@ -164,14 +166,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
            op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
                                        static_cast<WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
                                        static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
-                                        conv_param.G_,
-                                        conv_param.N_,
-                                        conv_param.K_,
-                                        conv_param.C_,
-                                        input_spatial_lengths,
-                                        filter_spatial_lengths,
-                                        output_spatial_lengths,
+                                        input_lengths,
                                        input_strides,
+                                        filter_lengths,
+                                        weights_strides,
+                                        output_lengths,
                                        output_strides,
                                        conv_filter_strides,
                                        conv_filter_dilations,

--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -3,20 +3,11 @@ set(PROFILER_SOURCES
    profiler.cpp
    profile_gemm.cpp
    profile_gemm_splitk.cpp
-    profile_gemm_streamk.cpp
-    profile_gemm_bilinear.cpp
    profile_gemm_bias_add_reduce.cpp
-    profile_gemm_add_add_fastgelu.cpp
    profile_gemm_add_multiply.cpp
-    profile_gemm_add_fastgelu.cpp
-    profile_gemm_add_relu_add_layernorm.cpp
-    profile_gemm_fastgelu.cpp
    profile_gemm_reduce.cpp
    profile_batched_gemm.cpp
-    profile_batched_gemm_gemm.cpp
-    profile_batched_gemm_add_relu_gemm_add.cpp
    profile_batched_gemm_reduce.cpp
-    profile_grouped_gemm.cpp
    profile_conv_fwd.cpp
    profile_conv_fwd_bias_relu.cpp
    profile_conv_fwd_bias_relu_add.cpp
@@ -32,7 +23,6 @@ set(PROFILER_SOURCES
    profile_batchnorm_fwd.cpp
    profile_batchnorm_bwd.cpp
    profile_batchnorm_infer.cpp
-    profile_grouped_gemm_fastgelu.cpp
    profile_contraction_bilinear.cpp
    profile_contraction_scale.cpp
    profile_grouped_conv_bwd_data.cpp
@@ -40,6 +30,18 @@ set(PROFILER_SOURCES
 if(DL_KERNELS)
  list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
 endif()
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
+  list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
+  list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
+  list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
+endif()

 set(PROFILER_EXECUTABLE ckProfiler)

@@ -49,20 +51,11 @@ target_compile_options(${PROFILER_EXECUTABLE} PRIVATE -Wno-global-constructors)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
@@ -79,13 +72,24 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_instan
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
 if(DL_KERNELS)
  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
 endif()
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
-target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
+if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
+endif()
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
--- a/profiler/src/profile_gemm.cpp
+++ b/profiler/src/profile_gemm.cpp
@@ -121,7 +121,10 @@ int profile_gemm(int argc, char* argv[])
        return pass ? 0 : 1;
    };

-    if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
+    if(false)
+        ;
+#ifdef __fp32__
+    else if(data_type == GemmDataType::F32_F32_F32 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, F32{}, F32{}, F32{}, F32{});
    }
@@ -137,6 +140,8 @@ int profile_gemm(int argc, char* argv[])
    {
        return profile(Col{}, Col{}, Row{}, F32{}, F32{}, F32{}, F32{});
    }
+#endif
+#ifdef __fp16__
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(Row{}, Row{}, Row{}, F16{}, F16{}, F32{}, F16{});
@@ -153,6 +158,7 @@ int profile_gemm(int argc, char* argv[])
    {
        return profile(Col{}, Col{}, Row{}, F16{}, F16{}, F32{}, F16{});
    }
+#endif
 #ifdef __bf16__
    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
    {

--- a/profiler/src/profile_grouped_gemm.cpp
+++ b/profiler/src/profile_grouped_gemm.cpp
@@ -88,7 +88,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    const auto StrideBs = argToIntArray(argv[12]);
    const auto StrideCs = argToIntArray(argv[13]);
    const int kbatch    = argc == 15 ? std::stoi(argv[14]) : 1;
-
+#ifdef __fp16__
    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        ck::profiler::profile_grouped_gemm_impl<ck::half_t,
@@ -173,7 +173,7 @@ int profile_grouped_gemm(int argc, char* argv[])
    {
        throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
    }
-
+#endif
    return 0;
 }